In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
In [2]:
df=pd.read_csv('airbnb.csv')
In [3]:
df.head()
Out[3]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 NaN NaN 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 0.10 1 0
In [4]:
df.describe()
Out[4]:
id host_id latitude longitude price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365
count 4.889500e+04 4.889500e+04 48895.000000 48895.000000 48895.000000 48895.000000 48895.000000 38843.000000 48895.000000 48895.000000
mean 1.901714e+07 6.762001e+07 40.728949 -73.952170 152.720687 7.029962 23.274466 1.373221 7.143982 112.781327
std 1.098311e+07 7.861097e+07 0.054530 0.046157 240.154170 20.510550 44.550582 1.680442 32.952519 131.622289
min 2.539000e+03 2.438000e+03 40.499790 -74.244420 0.000000 1.000000 0.000000 0.010000 1.000000 0.000000
25% 9.471945e+06 7.822033e+06 40.690100 -73.983070 69.000000 1.000000 1.000000 0.190000 1.000000 0.000000
50% 1.967728e+07 3.079382e+07 40.723070 -73.955680 106.000000 3.000000 5.000000 0.720000 1.000000 45.000000
75% 2.915218e+07 1.074344e+08 40.763115 -73.936275 175.000000 5.000000 24.000000 2.020000 2.000000 227.000000
max 3.648724e+07 2.743213e+08 40.913060 -73.712990 10000.000000 1250.000000 629.000000 58.500000 327.000000 365.000000
In [5]:
df.dtypes
Out[5]:
id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object
In [6]:
df.shape
Out[6]:
(48895, 16)
In [7]:
## to count the number of missing values-

missing_count=df.isnull().sum()
print(missing_count)
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64
In [8]:
df.duplicated().sum()
Out[8]:
0
In [9]:
df.isna().sum()
Out[9]:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

EDA¶

In [10]:
columns_with_missing = ['last_review', 'reviews_per_month']
# Filter rows where these columns have missing values
missing_values_df = df[df[columns_with_missing].isnull().any(axis=1)]

# Display a glimpse of the rows with missing values
print(missing_values_df[columns_with_missing])
      last_review  reviews_per_month
2             NaN                NaN
19            NaN                NaN
26            NaN                NaN
36            NaN                NaN
38            NaN                NaN
...           ...                ...
48890         NaN                NaN
48891         NaN                NaN
48892         NaN                NaN
48893         NaN                NaN
48894         NaN                NaN

[10052 rows x 2 columns]
In [11]:
# this can help us in determining how to fill the missing values
unique_dates = df['last_review'].unique()
print(unique_dates)

#######
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')

max_year = df['last_review'].dt.year.max()
min_year = df['last_review'].dt.year.min()

print(f"Max Year: {max_year}")
print(f"Min Year: {min_year}")
['2018-10-19' '2019-05-21' nan ... '2017-12-23' '2018-01-29' '2018-03-29']
Max Year: 2019.0
Min Year: 2011.0
In [12]:
#clean the 'last_review' column
df['last_review'] = pd.to_datetime(df['last_review'])

min_date= df['last_review'].min()
max_date= df['last_review'].max()
all_dates = pd.date_range(min_date, max_date, freq='D')
In [13]:
df['last_review'] = pd.to_datetime(df['last_review'])

df['Year'] = df['last_review'].dt.year
sns.relplot(x='Year', y='reviews_per_month', data=df, kind="line", ci=None)
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\axisgrid.py:848: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  func(*plot_args, **plot_kwargs)
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[13]:
<seaborn.axisgrid.FacetGrid at 0x1660298d310>
No description has been provided for this image
In [14]:
#drop the column Year because we don't need it anymore
# Assuming 'Year' is the column you want to drop
df.drop(columns='Year', inplace=True)
In [15]:
# no clear trends!
# so i can fill the missing values of column last_review: mean/mode OR Forward/Backward Fill
# i will choose mean, because I will not sort the dataframe by dates and its not a time series application.
mean_last_rev = df['last_review'].mean()
df['last_review'].fillna(value=mean_last_rev, inplace=True)
In [16]:
#understand the distribution first
plt.hist(df['reviews_per_month'], bins=30, color='skyblue', edgecolor='black')
plt.title('Histogram of reviews_per_month')
plt.xlabel('reviews_per_month')
plt.ylabel('Frequency')
plt.show()

# the distribution is right-skewed
No description has been provided for this image
In [17]:
mean_value = df['reviews_per_month'].mean()
median_value = df['reviews_per_month'].median()
mode_value = df['reviews_per_month'].mode().iloc[0]
std_dev = df['reviews_per_month'].std()

print(f"Mean: {mean_value}, Median: {median_value}, Mode: {mode_value}, Std Dev: {std_dev}")
Mean: 1.3732214298586618, Median: 0.72, Mode: 0.02, Std Dev: 1.6804419952744627
In [18]:
# I will fill the NaN values by median because of the distribution
df['reviews_per_month'].fillna(value=df['reviews_per_month'].median(), inplace=True)
In [19]:
# Dropping rows with missing values in 'name' and 'host_name' due to small percentage of data
df.dropna(subset=['name', 'host_name'], inplace=True)
In [20]:
df
Out[20]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365
0 2539 Clean & quiet apt home by the park 2787 John Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 2018-10-19 00:00:00.000000000 0.21 6 365
1 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 2019-05-21 00:00:00.000000000 0.38 2 355
2 3647 THE VILLAGE OF HARLEM....NEW YORK ! 4632 Elisabeth Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 2018-10-04 01:47:23.910099456 0.72 1 365
3 3831 Cozy Entire Floor of Brownstone 4869 LisaRoxanne Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 2019-07-05 00:00:00.000000000 4.64 1 194
4 5022 Entire Apt: Spacious Studio/Loft by central park 7192 Laura Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 2018-11-19 00:00:00.000000000 0.10 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
48890 36484665 Charming one bedroom - newly renovated rowhouse 8232441 Sabrina Brooklyn Bedford-Stuyvesant 40.67853 -73.94995 Private room 70 2 0 2018-10-04 01:47:23.910099456 0.72 2 9
48891 36485057 Affordable room in Bushwick/East Williamsburg 6570630 Marisol Brooklyn Bushwick 40.70184 -73.93317 Private room 40 4 0 2018-10-04 01:47:23.910099456 0.72 2 36
48892 36485431 Sunny Studio at Historical Neighborhood 23492952 Ilgar & Aysel Manhattan Harlem 40.81475 -73.94867 Entire home/apt 115 10 0 2018-10-04 01:47:23.910099456 0.72 1 27
48893 36485609 43rd St. Time Square-cozy single bed 30985759 Taz Manhattan Hell's Kitchen 40.75751 -73.99112 Shared room 55 1 0 2018-10-04 01:47:23.910099456 0.72 6 2
48894 36487245 Trendy duplex in the very heart of Hell's Kitchen 68119814 Christophe Manhattan Hell's Kitchen 40.76404 -73.98933 Private room 90 7 0 2018-10-04 01:47:23.910099456 0.72 1 23

48858 rows × 16 columns

In [21]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 48858 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48858 non-null  int64         
 1   name                            48858 non-null  object        
 2   host_id                         48858 non-null  int64         
 3   host_name                       48858 non-null  object        
 4   neighbourhood_group             48858 non-null  object        
 5   neighbourhood                   48858 non-null  object        
 6   latitude                        48858 non-null  float64       
 7   longitude                       48858 non-null  float64       
 8   room_type                       48858 non-null  object        
 9   price                           48858 non-null  int64         
 10  minimum_nights                  48858 non-null  int64         
 11  number_of_reviews               48858 non-null  int64         
 12  last_review                     48858 non-null  datetime64[ns]
 13  reviews_per_month               48858 non-null  float64       
 14  calculated_host_listings_count  48858 non-null  int64         
 15  availability_365                48858 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(7), object(5)
memory usage: 6.3+ MB
In [22]:
# to check the unique values for neighourhood group and room type 
print(df['neighbourhood_group'].unique())
print(df['room_type'].unique())
['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']
['Private room' 'Entire home/apt' 'Shared room']
In [23]:
#to check unique values for the the neighourhood
df['neighbourhood'].unique()
Out[23]:
array(['Kensington', 'Midtown', 'Harlem', 'Clinton Hill', 'East Harlem',
       'Murray Hill', 'Bedford-Stuyvesant', "Hell's Kitchen",
       'Upper West Side', 'Chinatown', 'South Slope', 'West Village',
       'Williamsburg', 'Fort Greene', 'Chelsea', 'Crown Heights',
       'Park Slope', 'Windsor Terrace', 'Inwood', 'East Village',
       'Greenpoint', 'Bushwick', 'Flatbush', 'Lower East Side',
       'Prospect-Lefferts Gardens', 'Long Island City', 'Kips Bay',
       'SoHo', 'Upper East Side', 'Prospect Heights',
       'Washington Heights', 'Woodside', 'Brooklyn Heights',
       'Carroll Gardens', 'Gowanus', 'Flatlands', 'Cobble Hill',
       'Flushing', 'Boerum Hill', 'Sunnyside', 'DUMBO', 'St. George',
       'Highbridge', 'Financial District', 'Ridgewood',
       'Morningside Heights', 'Jamaica', 'Middle Village', 'NoHo',
       'Ditmars Steinway', 'Flatiron District', 'Roosevelt Island',
       'Greenwich Village', 'Little Italy', 'East Flatbush',
       'Tompkinsville', 'Astoria', 'Clason Point', 'Eastchester',
       'Kingsbridge', 'Two Bridges', 'Rockaway Beach', 'Forest Hills',
       'Nolita', 'Woodlawn', 'University Heights', 'Gravesend',
       'Gramercy', 'Allerton', 'East New York', 'Theater District',
       'Concourse Village', 'Sheepshead Bay', 'Emerson Hill',
       'Fort Hamilton', 'Bensonhurst', 'Tribeca', 'Shore Acres',
       'Sunset Park', 'Concourse', 'Elmhurst', 'Brighton Beach',
       'Jackson Heights', 'Cypress Hills', 'St. Albans', 'Arrochar',
       'Rego Park', 'Wakefield', 'Clifton', 'Bay Ridge', 'Graniteville',
       'Spuyten Duyvil', 'Stapleton', 'Briarwood', 'Ozone Park',
       'Columbia St', 'Vinegar Hill', 'Mott Haven', 'Longwood',
       'Canarsie', 'Battery Park City', 'Civic Center', 'East Elmhurst',
       'New Springville', 'Morris Heights', 'Arverne', 'Cambria Heights',
       'Tottenville', 'Mariners Harbor', 'Concord', 'Borough Park',
       'Bayside', 'Downtown Brooklyn', 'Port Morris', 'Fieldston',
       'Kew Gardens', 'Midwood', 'College Point', 'Mount Eden',
       'City Island', 'Glendale', 'Port Richmond', 'Red Hook',
       'Richmond Hill', 'Queens Village', 'Bellerose', 'Maspeth',
       'Williamsbridge', 'Soundview', 'Woodhaven', 'Woodrow',
       'Co-op City', 'Stuyvesant Town', 'Parkchester', 'North Riverdale',
       'Dyker Heights', 'Bronxdale', 'Sea Gate', 'Riverdale',
       'Kew Gardens Hills', 'Bay Terrace', 'Norwood', 'Claremont Village',
       'Whitestone', 'Fordham', 'Bayswater', 'Navy Yard', 'Brownsville',
       'Eltingville', 'Fresh Meadows', 'Mount Hope', 'Lighthouse Hill',
       'Springfield Gardens', 'Howard Beach', 'Belle Harbor',
       'Jamaica Estates', 'Van Nest', 'Morris Park', 'West Brighton',
       'Far Rockaway', 'South Ozone Park', 'Tremont', 'Corona',
       'Great Kills', 'Manhattan Beach', 'Marble Hill', 'Dongan Hills',
       'Castleton Corners', 'East Morrisania', 'Hunts Point', 'Neponsit',
       'Pelham Bay', 'Randall Manor', 'Throgs Neck', 'Todt Hill',
       'West Farms', 'Silver Lake', 'Morrisania', 'Laurelton',
       'Grymes Hill', 'Holliswood', 'Pelham Gardens', 'Belmont',
       'Rosedale', 'Edgemere', 'New Brighton', 'Midland Beach',
       'Baychester', 'Melrose', 'Bergen Beach', 'Richmondtown',
       'Howland Hook', 'Schuylerville', 'Coney Island', 'New Dorp Beach',
       "Prince's Bay", 'South Beach', 'Bath Beach', 'Jamaica Hills',
       'Oakwood', 'Castle Hill', 'Hollis', 'Douglaston', 'Huguenot',
       'Olinville', 'Edenwald', 'Grant City', 'Westerleigh',
       'Bay Terrace, Staten Island', 'Westchester Square', 'Little Neck',
       'Fort Wadsworth', 'Rosebank', 'Unionport', 'Mill Basin',
       'Arden Heights', "Bull's Head", 'New Dorp', 'Rossville',
       'Breezy Point', 'Willowbrook'], dtype=object)

Visualization¶

In [24]:
sns.countplot(x='neighbourhood_group',data=df,palette='rocket')
plt.title('neighbourhood_group')
Out[24]:
Text(0.5, 1.0, 'neighbourhood_group')
No description has been provided for this image
In [25]:
plt.title('room_type')
sns.countplot(x='room_type',data=df,palette='mako')
Out[25]:
<Axes: title={'center': 'room_type'}, xlabel='room_type', ylabel='count'>
No description has been provided for this image
In [26]:
plt.figure(figsize=(6,8),dpi=80)
plt.title('neighbourhood_group')
sns.barplot(x='neighbourhood_group',y='number_of_reviews',data=df,palette='muted')
Out[26]:
<Axes: title={'center': 'neighbourhood_group'}, xlabel='neighbourhood_group', ylabel='number_of_reviews'>
No description has been provided for this image
In [27]:
plt.figure(figsize=(6,8),dpi=72)
plt.title('room_type')
sns.barplot(x='room_type',y='number_of_reviews',data=df,palette='rocket')
Out[27]:
<Axes: title={'center': 'room_type'}, xlabel='room_type', ylabel='number_of_reviews'>
No description has been provided for this image
In [28]:
plt.figure(figsize=(10, 6))
sns.countplot(x='neighbourhood_group', hue='room_type', data=df, palette='Spectral')
plt.title('Neighbourhood Group vs Room Type')
plt.xlabel('Neighbourhood Group')
plt.ylabel('Count')
plt.show()
No description has been provided for this image
In [29]:
sns.catplot(x='room_type', y='price', data=df, palette='rocket')
C:\Users\Mayank\AppData\Local\Temp\ipykernel_8536\2459576386.py:1: FutureWarning: Passing `palette` without assigning `hue` is deprecated.
  sns.catplot(x='room_type', y='price', data=df, palette='rocket')
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[29]:
<seaborn.axisgrid.FacetGrid at 0x1660296e8d0>
No description has been provided for this image
In [30]:
import plotly.express as px
# Create a scatter mapbox visualization
fig = px.scatter_mapbox(
    df,
    lat="latitude",
    lon="longitude",
    color="room_type",
    size="price",
    size_max=10,
    mapbox_style="carto-positron",
    hover_name="neighbourhood_group",
    hover_data=["price"],
    title="Room Types in NYC"
)

# Show the visualization
fig.show()

univariate analysis¶

In [31]:
#for the numerical data in df
df.describe().T
Out[31]:
count mean min 25% 50% 75% max std
id 48858.0 19023349.934565 2539.0 9475979.75 19691143.5 29157648.25 36487245.0 10982893.614232
host_id 48858.0 67631688.285951 2438.0 7818668.75 30791331.0 107434423.0 274321313.0 78623888.992733
latitude 48858.0 40.728941 40.49979 40.69009 40.72307 40.763107 40.91306 0.054528
longitude 48858.0 -73.95217 -74.24442 -73.98307 -73.95568 -73.93628 -73.71299 0.046159
price 48858.0 152.740309 0.0 69.0 106.0 175.0 10000.0 240.232386
minimum_nights 48858.0 7.012444 1.0 1.0 3.0 5.0 1250.0 20.019757
number_of_reviews 48858.0 23.273098 0.0 1.0 5.0 24.0 629.0 44.549898
last_review 48858 2018-10-04 06:14:40.804078336 2011-03-28 00:00:00 2018-10-04 01:47:23.910099456 2019-01-03 00:00:00 2019-06-19 00:00:00 2019-07-08 00:00:00 NaN
reviews_per_month 48858.0 1.239035 0.01 0.28 0.72 1.58 58.5 1.520889
calculated_host_listings_count 48858.0 7.148369 1.0 1.0 1.0 2.0 327.0 32.9646
availability_365 48858.0 112.801425 0.0 0.0 45.0 227.0 365.0 131.610962

we will focus on price in numerical data¶

In [32]:
sns.histplot(data=df['price'])
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

Out[32]:
<Axes: xlabel='price', ylabel='Count'>
No description has been provided for this image
In [33]:
sns.boxplot(data=df, y='price', color='lightgreen')
plt.title('Boxplot of Price')
plt.show()
No description has been provided for this image

for categorical value¶

In [34]:
df['room_type'].value_counts().plot(kind='pie',autopct='%.2f')
#df['neighbourhood_group'].value_counts().plot(kind='pie',autopct='%.2f')
Out[34]:
<Axes: ylabel='count'>
No description has been provided for this image
In [35]:
df['neighbourhood_group'].value_counts().plot(kind='pie',autopct='%.2f')
Out[35]:
<Axes: ylabel='count'>
No description has been provided for this image

Bivariate analysis¶

In [36]:
sns.scatterplot(data=df, x='price', y='minimum_nights')
plt.title('Price vs Minimum Nights')
plt.show()
No description has been provided for this image
In [37]:
sns.scatterplot(data=df, x='price', y='number_of_reviews')
plt.title('Price vs number_of_reviews')
plt.show()
No description has been provided for this image
In [38]:
sns.scatterplot(data=df, y='price', x='calculated_host_listings_count')
plt.title('Price vs calculated_host_listings_count')
plt.show()
No description has been provided for this image
In [39]:
sns.scatterplot(data=df, y='price', x='reviews_per_month')
plt.title('Price vs reviews_per_month')
plt.show()
No description has been provided for this image
In [40]:
sns.scatterplot(data=df, y='price', x='availability_365')
plt.title('Price vs availability_365')
plt.show()
No description has been provided for this image
In [41]:
nyc=df.drop(columns=['host_name','last_review','neighbourhood'])

# Convert categorical columns to numeric using label encoding

from sklearn.preprocessing import LabelEncoder
# Label encoding for neighbourhood_group
le_neighbourhood = LabelEncoder()
nyc['neighbourhood_group'] = le_neighbourhood.fit_transform(nyc['neighbourhood_group'])

# Label encoding for room_type
le_room_type = LabelEncoder()
nyc['room_type'] = le_room_type.fit_transform(nyc['room_type'])

# Calculate correlation matrix
numeric_columns = nyc.select_dtypes(include=[np.number])  # Select only numeric columns

correlation_matrix = numeric_columns.corr()
#print(correlation_matrix)

#heatmap
plt.figure(figsize=(10, 8))  # Adjust size as needed
sns.heatmap(correlation_matrix, 
            annot=True,           # Annotate with correlation coefficients
            fmt=".2f",            # Format for annotations
            cmap='viridis',      # Colormap
            square=True,          # Make each cell square
            cbar_kws={"shrink": 0.8})  # Shrink the color bar
plt.title("Correlation Matrix Heatmap")
plt.show()
No description has been provided for this image
In [42]:
df.groupby('neighbourhood_group')['price'].mean().plot(kind='bar', color='lightblue')
plt.title('Average Price by Neighbourhood Group')
plt.ylabel('Average Price')
plt.show()
No description has been provided for this image
In [43]:
sns.boxplot(data=df, x='room_type', y='price')
plt.title('Price Distribution by Room Type')
plt.show()
No description has been provided for this image
In [44]:
pd.crosstab(df['room_type'], df['neighbourhood_group']).plot(kind='bar', stacked=True)
plt.title('Room Type Distribution Across Neighbourhood Groups')
plt.show()
No description has been provided for this image

Multivariate analysis¶

In [45]:
sns.pairplot(df[['price', 'minimum_nights', 'availability_365', 'reviews_per_month']])
plt.show()
C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

C:\Users\Mayank\anaconda3034\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning:

use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.

No description has been provided for this image
In [46]:
nyc_model = df.select_dtypes(include=[np.number])  # Select only numeric columns
corr = nyc_model.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='cividis')
plt.title('Correlation Between Variables')
plt.show()
No description has been provided for this image
In [47]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
features = df[['price', 'minimum_nights', 'availability_365']].dropna()
kmeans.fit(features)
features['cluster'] = kmeans.labels_
sns.scatterplot(x=features['price'], y=features['minimum_nights'], hue=features['cluster'])
plt.show()
No description has been provided for this image

MULTICOLLINEARTY¶

RESIDUAL PLOT¶

GRIDSEARCH¶

In [48]:
df=df.drop(columns=['host_name','last_review','neighbourhood'])

# Convert categorical columns to numeric using label encoding

from sklearn.preprocessing import LabelEncoder
# Label encoding for neighbourhood_group
le_neighbourhood = LabelEncoder()
df['neighbourhood_group'] = le_neighbourhood.fit_transform(df['neighbourhood_group'])

# Label encoding for room_type
le_room_type = LabelEncoder()
df['room_type'] = le_room_type.fit_transform(df['room_type'])
In [49]:
# Step 2: Select features and the target variable
features = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'availability_365','neighbourhood_group','room_type']
X = df[features]
y = df['price']
In [61]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 48858 entries, 0 to 48894
Data columns (total 13 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48858 non-null  int64  
 1   name                            48858 non-null  object 
 2   host_id                         48858 non-null  int64  
 3   neighbourhood_group             48858 non-null  int32  
 4   latitude                        48858 non-null  float64
 5   longitude                       48858 non-null  float64
 6   room_type                       48858 non-null  int32  
 7   price                           48858 non-null  int64  
 8   minimum_nights                  48858 non-null  int64  
 9   number_of_reviews               48858 non-null  int64  
 10  reviews_per_month               48858 non-null  float64
 11  calculated_host_listings_count  48858 non-null  int64  
 12  availability_365                48858 non-null  int64  
dtypes: float64(3), int32(2), int64(7), object(1)
memory usage: 4.8+ MB
In [65]:
df.drop('name',axis=1,inplace=True)

STANDARDIZATION¶

In [66]:
sc=StandardScaler()
df=sc.fit_transform(df)
In [67]:
# Step 3: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [68]:
# Step 4: Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
Out[68]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [69]:
# Step 5: Generate the residual plot
y_pred = model.predict(X_test)
residuals = y_test - y_pred

plt.figure(figsize=(15, 7))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(0,color='red', linestyle='--' , linewidth=1.2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
No description has been provided for this image
In [70]:
## To assess multicollinearity, we calculate the Variance Inflation Factor (VIF) for each feature. 
# Step 6: Calculate Variance Inflation Factor (VIF)
vif_data = pd.DataFrame()
vif_data['Feature'] = X.columns
vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
In [71]:
print("Variance Inflation Factor (VIF):")
print(vif_data)
Variance Inflation Factor (VIF):
               Feature            VIF
0             latitude  473038.081597
1            longitude  471977.141989
2       minimum_nights       1.167698
3    number_of_reviews       1.327913
4     availability_365       1.850077
5  neighbourhood_group       6.794383
6            room_type       1.880712
In [72]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
In [73]:
# Define the model
model = RandomForestRegressor(random_state=42)

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
In [58]:
# Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters:", grid_search.best_params_)
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
In [59]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
In [82]:
import warnings
import time
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler
from sklearn.impute import KNNImputer
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from ydata_profiling import ProfileReport
#pip install ydata-profiling

MODEL BUILDING¶

In [83]:
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          #('SVR', SVR()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror'))]

rmse_scores = []
r2_scores = []
mae_scores = []
mse_scores = []
execution_times = []

for name, regressor in models:
    start_time = time.time()

    # Fit the model
    regressor.fit(X_train, y_train)

    # Make predictions
    y_pred = regressor.predict(X_test)

    # Calculate RMSE
    rmse = np.mean(np.sqrt(-cross_val_score(regressor, X, y, cv=5, scoring="neg_mean_squared_error")))
    rmse_scores.append(rmse)
    
    # Calculate R^2 score
    r2 = metrics.r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Calculate MAE
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # Calculate MSE
    mse = metrics.mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    # Calculate the execution time of the model
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    print(f"RMSE: {round(rmse, 4)} ({name})")
    print(f"R^2 Score: {round(r2, 4)} ({name})")
    print(f"MAE: {round(mae, 4)} ({name})")
    print(f"MSE: {round(mse, 4)} ({name})")
    print(f"Execution Time: {round(execution_time, 2)} seconds\n")

# Plot RMSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], rmse_scores)
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance (RMSE)")
plt.show()

# Plot R^2 scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], r2_scores)
plt.xlabel("Model")
plt.ylabel("R^2 Score")
plt.title("Model Performance (R^2 Score)")
plt.show()

# Plot MAE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mae_scores)
plt.xlabel("Model")
plt.ylabel("MAE")
plt.title("Model Performance (MAE)")
plt.show()

# Plot MSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mse_scores)
plt.xlabel("Model")
plt.ylabel("MSE")
plt.title("Model Performance (MSE)")
plt.show()

# Plot execution times
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], execution_times)
plt.xlabel("Execution Time (seconds)")
plt.ylabel("Model")
plt.title("Execution Times for Different Models")
plt.show()
RMSE: 227.9741 (LR)
R^2 Score: 0.0925 (LR)
MAE: 73.5269 (LR)
MSE: 45800.715 (LR)
Execution Time: 0.22 seconds

RMSE: 227.9735 (Ridge)
R^2 Score: 0.0925 (Ridge)
MAE: 73.4931 (Ridge)
MSE: 45798.9203 (Ridge)
Execution Time: 0.11 seconds

RMSE: 229.2451 (Lasso)
R^2 Score: 0.0838 (Lasso)
MAE: 74.5979 (Lasso)
MSE: 46236.5532 (Lasso)
Execution Time: 0.11 seconds

RMSE: 232.9166 (ElasticNet)
R^2 Score: 0.0541 (ElasticNet)
MAE: 81.2278 (ElasticNet)
MSE: 47734.9972 (ElasticNet)
Execution Time: 0.11 seconds

RMSE: 252.8757 (KNN)
R^2 Score: -0.1436 (KNN)
MAE: 88.7902 (KNN)
MSE: 57715.8617 (KNN)
Execution Time: 2.0 seconds

RMSE: 348.8585 (CART)
R^2 Score: -1.0991 (CART)
MAE: 88.899 (CART)
MSE: 105935.0795 (CART)
Execution Time: 2.61 seconds

RMSE: 239.1098 (RF)
R^2 Score: -0.0313 (RF)
MAE: 69.6856 (RF)
MSE: 52045.4296 (RF)
Execution Time: 168.63 seconds

RMSE: 224.9954 (GBM)
R^2 Score: 0.1348 (GBM)
MAE: 66.5429 (GBM)
MSE: 43664.9368 (GBM)
Execution Time: 29.19 seconds

RMSE: 237.2593 (XGBoost)
R^2 Score: 0.039 (XGBoost)
MAE: 70.0264 (XGBoost)
MSE: 48497.3853 (XGBoost)
Execution Time: 1.49 seconds

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

HYPERPARAMETER TUNING¶

In [84]:
# Initialize the models
models = [('LR', LinearRegression()),
          ("Ridge", Ridge()),
          ("Lasso", Lasso()),
          ("ElasticNet", ElasticNet()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor()),
          ('RF', RandomForestRegressor()),
          #('SVR', SVR()),
          ('GBM', GradientBoostingRegressor()),
          ("XGBoost", XGBRegressor(objective='reg:squarederror'))]

# Initialize lists to store metrics
rmse_scores = []
r2_scores = []
mae_scores = []
mse_scores = []
execution_times = []

# Define the hyperparameters for each model
param_grids = {
    'LR': {},
    'Ridge': {'alpha': [0.1, 1.0]},
    'Lasso': {'alpha': [0.1, 1.0]},
    'ElasticNet': {'alpha': [0.1, 1.0], 'l1_ratio': [0.1, 0.9]},
    'KNN': {'n_neighbors': [3, 5]},
    'CART': {'max_depth': [None, 10], 'min_samples_leaf': [1, 2]},
    'RF': {'n_estimators': [10, 50], 'max_depth': [None, 10]},
    'GBM': {'n_estimators': [10, 50], 'learning_rate': [0.01, 0.1]},
    'XGBoost': {'n_estimators': [10, 50], 'learning_rate': [0.01, 0.1]}}


# Train and evaluate the models with hyperparameter tuning
for name, regressor in models:
    print(f"Hyperparameter Tuning for {name}:")
    start_time = time.time()

    if param_grids[name]:
        grid_search = GridSearchCV(regressor, param_grid=param_grids[name], cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_

        print(f"Best parameters: {grid_search.best_params_}")
    else:
        best_model = regressor.fit(X_train, y_train)

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

    # Calculate R^2 score
    r2 = r2_score(y_test, y_pred)
    r2_scores.append(r2)

    # Calculate MAE
    mae = mean_absolute_error(y_test, y_pred)
    mae_scores.append(mae)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    mse_scores.append(mse)

    # Calculate the execution time of the model
    execution_time = time.time() - start_time
    execution_times.append(execution_time)

    print(f"RMSE: {round(rmse, 4)} ({name})")
    print(f"R^2 Score: {round(r2, 4)} ({name})")
    print(f"MAE: {round(mae, 4)} ({name})")
    print(f"MSE: {round(mse, 4)} ({name})")
    print(f"Execution Time: {round(execution_time, 2)} seconds\n")

# Plot RMSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], rmse_scores)
plt.xlabel("Model")
plt.ylabel("RMSE")
plt.title("Model Performance (RMSE)")
plt.show()

# Plot R^2 scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], r2_scores)
plt.xlabel("Model")
plt.ylabel("R^2 Score")
plt.title("Model Performance (R^2 Score)")
plt.show()

# Plot MAE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mae_scores)
plt.xlabel("Model")
plt.ylabel("MAE")
plt.title("Model Performance (MAE)")
plt.show()

# Plot MSE scores
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], mse_scores)
plt.xlabel("Model")
plt.ylabel("MSE")
plt.title("Model Performance (MSE)")
plt.show()

# Plot execution times
plt.figure(figsize=(10, 10))
plt.bar([name for name, _ in models], execution_times)
plt.xlabel("Execution Time (seconds)")
plt.ylabel("Model")
plt.title("Execution Times for Different Models")
plt.show()
Hyperparameter Tuning for LR:
RMSE: 214.011 (LR)
R^2 Score: 0.0925 (LR)
MAE: 73.5269 (LR)
MSE: 45800.715 (LR)
Execution Time: 0.02 seconds

Hyperparameter Tuning for Ridge:
Best parameters: {'alpha': 0.1}
RMSE: 214.0106 (Ridge)
R^2 Score: 0.0925 (Ridge)
MAE: 73.5234 (Ridge)
MSE: 45800.5211 (Ridge)
Execution Time: 6.35 seconds

Hyperparameter Tuning for Lasso:
Best parameters: {'alpha': 0.1}
RMSE: 213.9908 (Lasso)
R^2 Score: 0.0926 (Lasso)
MAE: 73.2759 (Lasso)
MSE: 45792.0777 (Lasso)
Execution Time: 0.25 seconds

Hyperparameter Tuning for ElasticNet:
Best parameters: {'alpha': 0.1, 'l1_ratio': 0.9}
RMSE: 215.1073 (ElasticNet)
R^2 Score: 0.0831 (ElasticNet)
MAE: 74.7703 (ElasticNet)
MSE: 46271.1676 (ElasticNet)
Execution Time: 0.47 seconds

Hyperparameter Tuning for KNN:
Best parameters: {'n_neighbors': 5}
RMSE: 240.2413 (KNN)
R^2 Score: -0.1436 (KNN)
MAE: 88.7902 (KNN)
MSE: 57715.8617 (KNN)
Execution Time: 1.69 seconds

Hyperparameter Tuning for CART:
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2}
RMSE: 239.1553 (CART)
R^2 Score: -0.1333 (CART)
MAE: 68.4253 (CART)
MSE: 57195.2403 (CART)
Execution Time: 1.53 seconds

Hyperparameter Tuning for RF:
Best parameters: {'max_depth': 10, 'n_estimators': 50}
RMSE: 220.8294 (RF)
R^2 Score: 0.0337 (RF)
MAE: 67.5705 (RF)
MSE: 48765.6126 (RF)
Execution Time: 29.89 seconds

Hyperparameter Tuning for GBM:
Best parameters: {'learning_rate': 0.1, 'n_estimators': 50}
RMSE: 208.7253 (GBM)
R^2 Score: 0.1367 (GBM)
MAE: 66.4528 (GBM)
MSE: 43566.2706 (GBM)
Execution Time: 10.76 seconds

Hyperparameter Tuning for XGBoost:
Best parameters: {'learning_rate': 0.1, 'n_estimators': 10}
RMSE: 214.8463 (XGBoost)
R^2 Score: 0.0854 (XGBoost)
MAE: 70.1504 (XGBoost)
MSE: 46158.9199 (XGBoost)
Execution Time: 1.56 seconds

No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

BEST MODEL¶

In [85]:
best_model
Out[85]:
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=10, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
In [86]:
# Final Prediction Model
final_model = best_model

# Make predictions on the test set using the final model
y_final_pred = final_model.predict(X_test)
final_y_pred = (y_final_pred)
final_y_test =(y_test)
In [87]:
# Create a DataFrame with the predicted prices and true prices
results = pd.DataFrame({'Predicted Price': final_y_pred, 'True Price': final_y_test})

# Calculate the difference between the true prices and predicted prices and add a new column
results['Difference'] = results['True Price'] - results['Predicted Price']

# Display the results
print(results)
       Predicted Price  True Price  Difference
5880        177.967026         140  -37.967026
35926       236.515106         399  162.484894
16413       143.900497         117  -26.900497
23347        97.397881          25  -72.397881
2531        222.163651         145  -77.163651
...                ...         ...         ...
9154        189.075378         177  -12.075378
29973        94.300438          75  -19.300438
10901        98.460770         100    1.539230
4182         93.681686          71  -22.681686
43095       125.407310          70  -55.407310

[9772 rows x 3 columns]

FEATURE IMPORTANCE¶

In [90]:
def plot_importance(model, features, num=50, save=False):
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    plt.figure(figsize=(8, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value",
                                                                     ascending=False)[0:num])
    plt.title('Features')
    plt.tight_layout()
    plt.show(block=True)
    if save:
        plt.savefig('importances.png')

plot_importance(final_model, X)
No description has been provided for this image